#!/bin/bash

# This script generates a large lexicon (241k words) from the
# SAFE-T and OpenSubtitles datasets. It reproduces the lexicon
# (vocabulary) that we used for the OpenSAT20 competition.
# First, it gets a wordlist from the text corpus with Pocolm
# (or you can specify your own wordlist).
# Next, it combines the wordlist with the words in CMU_Dict
# and finds the words that do not have pronunciations.
# Then, it uses G2P to generate pronunciations for those new words.
# Finally, CMU_Dict and the generated lexicon are merged,
# resulting in a vocabulary of size 241181 (and a lexicon of size
# 397453).

# Begin configuration section.
# NOTE(review): variables assigned before utils/parse_options.sh is sourced are
# presumably overridable from the command line (Kaldi convention) — confirm.
word_list=        # existing wordlist to use; when empty, stage 0 builds one with pocolm
num_words=200000  # number of words in the vocab generated by pocolm
text_dir=/export/c03/pzelasko/opensat/pocolm/egs/opensat/data/text   # TODO: should be replaced with some "raw" data directory
# End configuration section

stage=0                    # first stage to run; stages below this number are skipped
work_dir=data/local/vocab  # directory for the intermediate vocab files and the merged lexicon

# Options are parsed before strict mode is enabled, as in the original ordering.
. ./utils/parse_options.sh
set -e -o pipefail
set -o nounset                              # Treat unset variables as an error
. ./path.sh
. ./cmd.sh

# Create the working directory only after option parsing, so that the directory
# created is the final value of work_dir, and after `set -e`, so that a failure
# to create it stops the script instead of being ignored.
mkdir -p "$work_dir"

# Stage 0: obtain the wordlist. If word_list was supplied it is used as is;
# otherwise a vocabulary of up to $num_words words is built from $text_dir with
# pocolm, filtered, and written to data/local/pocolm/vocab_$num_words.txt.
if [ "$stage" -le 0 ]; then
  if [ -z "$word_list" ]; then
    echo "Generating a wordlist/vocab up to $num_words words from the given corpus with pocolm..."

    # nounset is active: expand with a default so that an unset KALDI_ROOT
    # reaches the message below instead of aborting with "unbound variable".
    if [ -z "${KALDI_ROOT:-}" ]; then
      echo "$0: KALDI_ROOT is not set in path.sh" >&2
      exit 1
    fi
    pocolm=$KALDI_ROOT/tools/pocolm

    # train_lm.py is only used as a marker that pocolm is installed.
    if [ ! -f "$pocolm/scripts/train_lm.py" ]; then
      echo "$0: you should install pocolm.  cd to $KALDI_ROOT/tools and run extras/install_pocolm.sh." >&2
      exit 1
    fi

    pocolm_work_dir=data/local/pocolm
    mkdir -p "$pocolm_work_dir"

    "$pocolm/scripts/validate_text_dir.py" "$text_dir"

    echo "Getting word counts from $text_dir ..."
    word_counts_dir="${pocolm_work_dir}/word_counts"
    "$pocolm/scripts/get_word_counts.py" "$text_dir" "$word_counts_dir"

    echo "Getting unigram weights ..."
    unigram_weights="${pocolm_work_dir}/unigram_weights"
    "$pocolm/scripts/get_unigram_weights.py" "$word_counts_dir" > "$unigram_weights"

    echo "Generating vocab ..."
    word_list="${pocolm_work_dir}/vocab_$num_words.txt"
    # Skip the first output line and keep only the first column of the rest.
    # NOTE(review): presumably the first line is a special entry and the
    # remaining columns are word ids — confirm against pocolm's output format.
    "$pocolm/scripts/word_counts_to_vocab.py" --num-words="$num_words" \
        --weights="$unigram_weights" "$word_counts_dir" | \
        tail -n +2 | awk '{print $1}' > "${word_list}.0"

    echo "Filtering out irregular words ..."
    # Two-pass filtering: .0 -> .1 -> final wordlist. The stdout of the second
    # pass is sorted into ${word_list}.irregular (presumably the rejected
    # words — confirm against post_process.py).
    python3 local/OpenSubtitles/pre_filter.py "${word_list}.0" "${word_list}.1"
    python3 local/OpenSubtitles/post_process.py "${word_list}.1" "${word_list}" | sort > "${word_list}.irregular"

    echo "Done with vocab: $word_list"
  else
    echo "Using the vocab: $word_list"
  fi
fi


# Stage 1: prepare CMUDict and derive its vocabulary.
if [ $stage -le 1 ]; then
  echo "Getting CMUDict ..."
  # This helper leaves its result in data/local/dict_nosp/lexicon.txt.
  local/safet_get_cmu_dict.sh

  # The vocabulary is the set of distinct first space-delimited fields
  # of the lexicon entries.
  cmu_lexicon=data/local/dict_nosp/lexicon.txt
  cut -d ' ' -f1 $cmu_lexicon \
    | sort \
    | uniq \
    > ${work_dir}/vocab_cmu.txt
fi


# Stage 2: merge the CMUDict vocabulary with the wordlist and extract the words
# that are not in CMUDict (written to vocab_diff.txt).
if [ "$stage" -le 2 ]; then
  echo "Merging two vocabs ..."

  # When stage 0 is skipped and no wordlist was given, word_list is still
  # empty. Left unquoted, that would make `wc` read stdin and the merge contain
  # only CMUDict words. Fall back to the file stage 0 writes by default.
  if [ -z "$word_list" ]; then
    word_list=data/local/pocolm/vocab_$num_words.txt
    echo "Using the vocab: $word_list"
  fi
  if [ ! -f "$word_list" ]; then
    echo "$0: wordlist '$word_list' not found; run stage 0 or set word_list" >&2
    exit 1
  fi

  wc "${work_dir}/vocab_cmu.txt"
  wc "$word_list"

  sort "${work_dir}/vocab_cmu.txt" "$word_list" | uniq > "${work_dir}/vocab_merged.txt"
  wc "${work_dir}/vocab_merged.txt"

  # find the words only appearing in vocab_merged.txt, for which we will generate
  # pronunciations using g2p.
  # vocab_merged.txt was sorted just above; vocab_cmu.txt may come from an
  # earlier invocation, so it is re-sorted to satisfy comm's sorted-input
  # requirement.
  comm -23 "${work_dir}/vocab_merged.txt" <(sort "${work_dir}/vocab_cmu.txt") \
      > "${work_dir}/vocab_diff.txt"
  wc "${work_dir}/vocab_diff.txt"
fi


# Stage 3: train the G2P model on the CMUDict lexicon; the model goes to exp/g2p.
if [ $stage -le 3 ]; then
  echo "Training a G2P model ..."
  # Be prepared to wait: on CMUDict the training runs 4 iterations and may
  # need more than 5 hours, possibly 8. Separate training runs also appear
  # to yield slightly different models, and therefore different lexicons.

  dict_dir=data/local/dict_nosp
  lexicon=$dict_dir/lexicon.txt
  sil=$dict_dir/silence_phones.txt
  steps/dict/train_g2p.sh \
      --cmd "$train_cmd" \
      --silence-phones $sil \
      $lexicon exp/g2p
fi


# Stage 4: generate pronunciations for the new words with the trained G2P model,
# merge them with CMUDict, and install the result as the dict_nosp lexicon.
if [ "$stage" -le 4 ]; then
  echo "Applying G2P to the new words ..."
  steps/dict/apply_g2p.sh --cmd "$train_cmd" --nj 32 \
      "${work_dir}/vocab_diff.txt" exp/g2p exp/g2p/oov_lex

  echo "Merging the generated lexicon with CMUDict ..."
  # cf: https://github.com/kaldi-asr/kaldi/blob/master/egs/tedlium/s5_r2_wsj/local/prepare_dict.sh#L183-L197
  # Keep tab-separated fields 1 and 3 of the G2P output (presumably the word
  # and its pronunciation, with field 2 a score — confirm), and drop lines
  # left with a single whitespace-separated field, i.e. nothing after the word.
  cut -f 1,3 exp/g2p/oov_lex/lexicon.lex | \
      awk '{if (NF > 1) print $0}' > "$work_dir/dict.oovs_g2p"

  sort data/local/dict_nosp/lexicon.txt "$work_dir/dict.oovs_g2p" | uniq \
      > "$work_dir/lexicon.txt" || exit 1;
  echo "Done. The resulting large lexicon is in $work_dir/lexicon.txt"
  echo "It will now replace the lexicon.txt in data/local/dict_nosp/lexicon.txt"

  # Back up the CMUDict lexicon only once. If this stage is re-run,
  # data/local/dict_nosp/lexicon.txt is already the merged lexicon, and moving
  # it over the backup would destroy the only copy of the original CMUDict.
  if [ -f data/local/dict_nosp/lexicon_cmu.txt ]; then
    echo "$0: keeping existing backup data/local/dict_nosp/lexicon_cmu.txt (delete it to refresh the backup)"
  else
    mv data/local/dict_nosp/lexicon.txt data/local/dict_nosp/lexicon_cmu.txt
  fi
  cp "$work_dir/lexicon.txt" data/local/dict_nosp/lexicon.txt
fi

